{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import pandas as pd\n",
    "import json\n",
    "from parsel import Selector\n",
    "import re\n",
    "import time\n",
    "import datetime\n",
    "from tqdm import tqdm\n",
    "pd.set_option('display.max_columns', None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "要获取的： 上次开播\n",
    "订阅数：\n",
    "公告：\n",
    "公会\n",
    "\n",
    "a：\n",
    "    是否开启摄像头\n",
    "\n",
    "b\n",
    "贡献榜：是否超过50人 ，总多少\n",
    "粉丝榜："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "code_folding": [
     0
    ]
   },
   "outputs": [],
   "source": [
    "# 标签说明  \n",
    "#- gameFullName:     游戏名称\n",
    "# - gameHostName:     \n",
    "# - boxDatainfo:     \n",
    "# - totalCount:       人气值\n",
    "# - roomName:         直播间名\n",
    "# - bussType:       \n",
    "# - screenshot:       直播截图\n",
    "# - privateHost:      初始id？\n",
    "# - nick:             昵称\n",
    "# - avatar180:        头像\n",
    "# - gid:          \n",
    "# - introduction:     简介\n",
    "# - recommendStatus:  推荐类型ID\n",
    "# - recommendTagName: 推荐类型名称\n",
    "# - isBluRay:         是否蓝光\n",
    "# - bluRayMBitRate:   蓝光清晰度\n",
    "# - screenType:       截屏类型\n",
    "# - liveSourceType:   分区？？（一起玩、女神、云顶之弈...）\n",
    "# - uid:              用户ID\n",
    "# - channel:          公会id\n",
    "# - livechannel:      公会id\n",
    "# - imgRecInfo:\n",
    "# - aliveNum:         \n",
    "# - attribute:         使用英雄、是否上电视？\n",
    "# - profileRoom:       房间号,可直接用于网址搜索房间\n",
    "# - isRoomPay:         是否为付费房间\n",
    "# - roomPayTag:        \n",
    "# - isWatchTogetherVip: 是否为一起看vip房间\n",
    "  \n",
    "  \n",
    "  \n",
    "# - 主播动态视频\n",
    "# - 公告栏\n",
    "# - 周榜信息\n",
    "# - 等级\n",
    "# - 个人信息\n",
    "#     - 订阅\n",
    "#     - 性别\n",
    "#     - "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the working tables from CSV.\n",
    "ht = pd.read_csv('4月4日数据_已提取标签.csv')  # ht: host (streamer) info table\n",
    "pht = pd.read_csv('主播个人信息.csv')          # pht: personal host table (per-streamer personal info)\n",
    "pl = pd.read_csv('房间信息.csv')               # pl: room info table\n",
    "wc = pd.read_csv('周贡献榜.csv')   # wc: weekly contribution ranking"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "###### 获取所在公会"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Collect the guild (signed channel) record of every streamer uid into channelList.\n",
    "# Placeholder values used for streamers whose response carries no guild data.\n",
    "channel_None = {'isPlaintum': None, 'channelNumber': None, 'name': None, 'logo': None, 'isDiamond': None, 'channelId': None}\n",
    "def get_channel(uid):\n",
    "    \"\"\"Fetch the guild info of ``uid`` and append one record to ``channelList``.\n",
    "\n",
    "    The endpoint is called with ``callback=getSignChannelInfo`` and answers with\n",
    "    a JSONP-wrapped JSON document. When its ``data`` field is empty, a copy of\n",
    "    ``channel_None`` is stored instead. Every stored record carries the uid\n",
    "    under the '用户id' key. Returns None; the result is the side effect on the\n",
    "    module-level ``channelList``.\n",
    "    \"\"\"\n",
    "    url = f'https://chgate.huya.com/proxy/index?service=thrift_sign&iface=getSignChannelInfo&callback=getSignChannelInfo&data={uid}'\n",
    "    r = requests.get(url, timeout=10)\n",
    "    # Strip the JSONP wrapper: keep what lies between the first '(' and the last ')'.\n",
    "    body = r.text\n",
    "    jsonData = json.loads(body[body.index('(') + 1:body.rindex(')')])\n",
    "    # Build a fresh dict per uid. Appending the shared channel_None object itself\n",
    "    # would make all guild-less rows alias one dict that ends up with the last uid.\n",
    "    record = dict(jsonData['data'] or channel_None)\n",
    "    record['用户id'] = uid\n",
    "    channelList.append(record)\n",
    "\n",
    "channelList = []   # must match the name get_channel appends to\n",
    "a = pht['用户id'].apply(get_channel)   # assigned only so the cell does not echo the result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 246,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# 一些数据上的处理，因为刚刚爬少写了channel_None的uid\n",
    "# c = a[~a.用户id.isna()]\n",
    "# drop_index = c[c.duplicated()].sort_values('用户id').index\n",
    "# a.drop(drop_index, inplace=True)\n",
    "\n",
    "# a['用户id'] = pht['用户id'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 261,
   "metadata": {
    "code_folding": [],
    "collapsed": true,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户id</th>\n",
       "      <th>房间号</th>\n",
       "      <th>公会id</th>\n",
       "      <th>昵称</th>\n",
       "      <th>标签</th>\n",
       "      <th>isPlaintum</th>\n",
       "      <th>channelNumber</th>\n",
       "      <th>name</th>\n",
       "      <th>logo</th>\n",
       "      <th>isDiamond</th>\n",
       "      <th>channelId</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1820796294</td>\n",
       "      <td>12214</td>\n",
       "      <td>1820796294</td>\n",
       "      <td>时代-十四剑姬</td>\n",
       "      <td>大神推荐</td>\n",
       "      <td>True</td>\n",
       "      <td>90214.0</td>\n",
       "      <td>时代游戏</td>\n",
       "      <td>https://huyaimg.msstatic.com/cdnimage/channell...</td>\n",
       "      <td>True</td>\n",
       "      <td>14732.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2367547387</td>\n",
       "      <td>333003</td>\n",
       "      <td>2367547387</td>\n",
       "      <td>Zz1tai姿态</td>\n",
       "      <td>大神推荐</td>\n",
       "      <td>True</td>\n",
       "      <td>90999.0</td>\n",
       "      <td>皇族Rstar</td>\n",
       "      <td>https://huyaimg.msstatic.com/cdnimage/channell...</td>\n",
       "      <td>False</td>\n",
       "      <td>67547117.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>1346609715</td>\n",
       "      <td>660000</td>\n",
       "      <td>1346609715</td>\n",
       "      <td>英雄联盟赛事</td>\n",
       "      <td>魅力新星</td>\n",
       "      <td>True</td>\n",
       "      <td>66.0</td>\n",
       "      <td>虎牙游戏</td>\n",
       "      <td>https://huyaimg.msstatic.com/cdnimage/channell...</td>\n",
       "      <td>False</td>\n",
       "      <td>78941969.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>2183525275</td>\n",
       "      <td>572329</td>\n",
       "      <td>2183525275</td>\n",
       "      <td>Dae-北枫c</td>\n",
       "      <td>大神推荐</td>\n",
       "      <td>True</td>\n",
       "      <td>90725.0</td>\n",
       "      <td>招到期主播</td>\n",
       "      <td>https://huyaimg.msstatic.com/cdnimage/channell...</td>\n",
       "      <td>True</td>\n",
       "      <td>16634988.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>2334560814</td>\n",
       "      <td>420342</td>\n",
       "      <td>2334560814</td>\n",
       "      <td>时代-老枪赵信【李弟】</td>\n",
       "      <td>NaN</td>\n",
       "      <td>True</td>\n",
       "      <td>90214.0</td>\n",
       "      <td>时代游戏</td>\n",
       "      <td>https://huyaimg.msstatic.com/cdnimage/channell...</td>\n",
       "      <td>True</td>\n",
       "      <td>14732.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         用户id     房间号        公会id           昵称    标签 isPlaintum  \\\n",
       "0  1820796294   12214  1820796294      时代-十四剑姬  大神推荐       True   \n",
       "1  2367547387  333003  2367547387     Zz1tai姿态  大神推荐       True   \n",
       "2  1346609715  660000  1346609715       英雄联盟赛事  魅力新星       True   \n",
       "3  2183525275  572329  2183525275      Dae-北枫c  大神推荐       True   \n",
       "4  2334560814  420342  2334560814  时代-老枪赵信【李弟】   NaN       True   \n",
       "\n",
       "   channelNumber     name                                               logo  \\\n",
       "0        90214.0     时代游戏  https://huyaimg.msstatic.com/cdnimage/channell...   \n",
       "1        90999.0  皇族Rstar  https://huyaimg.msstatic.com/cdnimage/channell...   \n",
       "2           66.0     虎牙游戏  https://huyaimg.msstatic.com/cdnimage/channell...   \n",
       "3        90725.0    招到期主播  https://huyaimg.msstatic.com/cdnimage/channell...   \n",
       "4        90214.0     时代游戏  https://huyaimg.msstatic.com/cdnimage/channell...   \n",
       "\n",
       "  isDiamond   channelId  \n",
       "0      True     14732.0  \n",
       "1     False  67547117.0  \n",
       "2     False  78941969.0  \n",
       "3      True  16634988.0  \n",
       "4      True     14732.0  "
      ]
     },
     "execution_count": 261,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 合并公会信息\n",
    "# a = pd.DataFrame(channelList)\n",
    "# a.to_csv('用户id-公会数据.csv', index=False)  # 保存公会数据\n",
    "# pht = pd.merge(pht, a, on='用户id', how='left')\n",
    "# pht.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 284,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# pht = pht.rename(columns={'channelNumber':'公会号', 'name':'公会名称', 'isDiamond':'是否为钻石会员', 'channelId':'公会id'})\n",
    "# # pht.drop(columns=['logo', '公会id'], inplace=True)\n",
    "# pht.to_csv('4月4日数据_已提取标签.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 285,
   "metadata": {
    "hidden": true,
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户id</th>\n",
       "      <th>房间号</th>\n",
       "      <th>昵称</th>\n",
       "      <th>标签</th>\n",
       "      <th>isPlaintum</th>\n",
       "      <th>公会号</th>\n",
       "      <th>公会名称</th>\n",
       "      <th>是否为钻石会员</th>\n",
       "      <th>公会id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1820796294</td>\n",
       "      <td>12214</td>\n",
       "      <td>时代-十四剑姬</td>\n",
       "      <td>大神推荐</td>\n",
       "      <td>True</td>\n",
       "      <td>90214.0</td>\n",
       "      <td>时代游戏</td>\n",
       "      <td>True</td>\n",
       "      <td>14732.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2367547387</td>\n",
       "      <td>333003</td>\n",
       "      <td>Zz1tai姿态</td>\n",
       "      <td>大神推荐</td>\n",
       "      <td>True</td>\n",
       "      <td>90999.0</td>\n",
       "      <td>皇族Rstar</td>\n",
       "      <td>False</td>\n",
       "      <td>67547117.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>1346609715</td>\n",
       "      <td>660000</td>\n",
       "      <td>英雄联盟赛事</td>\n",
       "      <td>魅力新星</td>\n",
       "      <td>True</td>\n",
       "      <td>66.0</td>\n",
       "      <td>虎牙游戏</td>\n",
       "      <td>False</td>\n",
       "      <td>78941969.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>2183525275</td>\n",
       "      <td>572329</td>\n",
       "      <td>Dae-北枫c</td>\n",
       "      <td>大神推荐</td>\n",
       "      <td>True</td>\n",
       "      <td>90725.0</td>\n",
       "      <td>招到期主播</td>\n",
       "      <td>True</td>\n",
       "      <td>16634988.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>2334560814</td>\n",
       "      <td>420342</td>\n",
       "      <td>时代-老枪赵信【李弟】</td>\n",
       "      <td>NaN</td>\n",
       "      <td>True</td>\n",
       "      <td>90214.0</td>\n",
       "      <td>时代游戏</td>\n",
       "      <td>True</td>\n",
       "      <td>14732.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>11140</td>\n",
       "      <td>2323517119</td>\n",
       "      <td>16531038</td>\n",
       "      <td>伊芙-蕾雅</td>\n",
       "      <td>NaN</td>\n",
       "      <td>True</td>\n",
       "      <td>11720.0</td>\n",
       "      <td>【傲辰集团】</td>\n",
       "      <td>True</td>\n",
       "      <td>11720.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>11141</td>\n",
       "      <td>1279532989136</td>\n",
       "      <td>24503499</td>\n",
       "      <td>祈雪咒</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>11142</td>\n",
       "      <td>2214316071</td>\n",
       "      <td>23382664</td>\n",
       "      <td>、青栀、</td>\n",
       "      <td>NaN</td>\n",
       "      <td>True</td>\n",
       "      <td>64113.0</td>\n",
       "      <td>九九娱乐</td>\n",
       "      <td>False</td>\n",
       "      <td>64113.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>11143</td>\n",
       "      <td>1746647510</td>\n",
       "      <td>21582991</td>\n",
       "      <td>神人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>11144</td>\n",
       "      <td>136907042</td>\n",
       "      <td>10265599</td>\n",
       "      <td>迷</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>11145 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                用户id       房间号           昵称    标签 isPlaintum      公会号  \\\n",
       "0         1820796294     12214      时代-十四剑姬  大神推荐       True  90214.0   \n",
       "1         2367547387    333003     Zz1tai姿态  大神推荐       True  90999.0   \n",
       "2         1346609715    660000       英雄联盟赛事  魅力新星       True     66.0   \n",
       "3         2183525275    572329      Dae-北枫c  大神推荐       True  90725.0   \n",
       "4         2334560814    420342  时代-老枪赵信【李弟】   NaN       True  90214.0   \n",
       "...              ...       ...          ...   ...        ...      ...   \n",
       "11140     2323517119  16531038        伊芙-蕾雅   NaN       True  11720.0   \n",
       "11141  1279532989136  24503499          祈雪咒   NaN       None      NaN   \n",
       "11142     2214316071  23382664         、青栀、   NaN       True  64113.0   \n",
       "11143     1746647510  21582991           神人   NaN       None      NaN   \n",
       "11144      136907042  10265599            迷   NaN       None      NaN   \n",
       "\n",
       "          公会名称 是否为钻石会员        公会id  \n",
       "0         时代游戏    True     14732.0  \n",
       "1      皇族Rstar   False  67547117.0  \n",
       "2         虎牙游戏   False  78941969.0  \n",
       "3        招到期主播    True  16634988.0  \n",
       "4         时代游戏    True     14732.0  \n",
       "...        ...     ...         ...  \n",
       "11140   【傲辰集团】    True     11720.0  \n",
       "11141     None    None         NaN  \n",
       "11142     九九娱乐   False     64113.0  \n",
       "11143     None    None         NaN  \n",
       "11144     None    None         NaN  \n",
       "\n",
       "[11145 rows x 9 columns]"
      ]
     },
     "execution_count": 285,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pht"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "###### 个人信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 337,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "|  | 【0.19%】https://www.huya.com/24493416\n",
      "|  | 【0.28%】https://www.huya.com/24467478\n",
      "|  | 【0.53%】https://www.huya.com/24342508\n",
      "|  | 【1.39%】https://www.huya.com/24357894\n",
      "|  | 【2.29%】https://www.huya.com/24462769\n",
      "|  | 【2.36%】https://www.huya.com/24467123\n",
      "|  | 【3.30%】https://www.huya.com/22344614\n",
      "| █ | 【4.24%】https://www.huya.com/19393396\n",
      "| █ | 【5.37%】https://www.huya.com/20807565\n",
      "| █ | 【6.19%】https://www.huya.com/19064631\n",
      "| █ | 【7.07%】https://www.huya.com/24458798\n",
      "| ██ | 【9.32%】https://www.huya.com/23677137\n",
      "| ██ | 【9.74%】https://www.huya.com/24477568\n",
      "| ███ | 【12.44%】https://www.huya.com/13990934\n",
      "| ████ | 【17.81%】https://www.huya.com/21640412\n",
      "| ████ | 【19.65%】https://www.huya.com/24471118\n",
      "| █████ | 【22.86%】https://www.huya.com/24488707\n",
      "| ██████ | 【24.86%】https://www.huya.com/19784046\n",
      "| ██████ | 【26.48%】https://www.huya.com/21982907\n",
      "| ███████ | 【30.20%】https://www.huya.com/24497476\n",
      "| ███████ | 【31.72%】https://www.huya.com/24483038\n",
      "| ███████ | 【31.90%】https://www.huya.com/24108539\n",
      "| ███████ | 【31.92%】https://www.huya.com/24471704\n",
      "| ████████ | 【32.97%】https://www.huya.com/20530064\n",
      "| ████████ | 【32.97%】https://www.huya.com/24488494\n",
      "| ████████ | 【32.98%】https://www.huya.com/24489024\n",
      "| ████████ | 【33.62%】https://www.huya.com/24489030\n",
      "| ████████ | 【33.63%】https://www.huya.com/24482904\n",
      "| ████████ | 【34.40%】https://www.huya.com/24462600\n",
      "| ████████ | 【35.32%】https://www.huya.com/24495164\n",
      "| ████████ | 【35.88%】https://www.huya.com/23611526\n",
      "| █████████ | 【36.92%】https://www.huya.com/24471013\n",
      "| █████████ | 【36.93%】https://www.huya.com/24495138\n",
      "| █████████ | 【37.01%】https://www.huya.com/24472438\n",
      "| █████████████ | 【54.58%】https://www.huya.com/23739871\n",
      "| █████████████ | 【55.50%】https://www.huya.com/22445930\n",
      "| █████████████ | 【55.58%】https://www.huya.com/19610360\n",
      "| ██████████████ | 【56.40%】https://www.huya.com/16136814\n",
      "| ██████████████ | 【57.87%】https://www.huya.com/24472502\n",
      "| ███████████████ | 【62.09%】https://www.huya.com/24491521\n",
      "| ███████████████ | 【62.12%】https://www.huya.com/20254804\n",
      "| ███████████████ | 【62.78%】https://www.huya.com/24495569\n",
      "| ████████████████ | 【67.04%】https://www.huya.com/24452011\n",
      "| █████████████████ | 【68.82%】https://www.huya.com/14558284\n",
      "| ██████████████████ | 【74.15%】https://www.huya.com/24493505\n",
      "| ██████████████████ | 【74.18%】https://www.huya.com/24464681\n",
      "| ███████████████████ | 【76.55%】https://www.huya.com/24469716\n",
      "| ███████████████████ | 【79.48%】https://www.huya.com/24494223\n",
      "| █████████████████████ | 【84.90%】https://www.huya.com/24494164\n",
      "| █████████████████████ | 【85.66%】https://www.huya.com/24487896\n",
      "| ██████████████████████ | 【90.44%】https://www.huya.com/24484206\n",
      "| ██████████████████████ | 【90.44%】https://www.huya.com/24494608\n",
      "| ██████████████████████ | 【90.46%】https://www.huya.com/24488768\n",
      "| ██████████████████████ | 【90.49%】https://www.huya.com/24496646\n",
      "| ██████████████████████ | 【91.05%】https://www.huya.com/24488712\n",
      "| ██████████████████████ | 【91.67%】https://www.huya.com/24483534\n",
      "| ███████████████████████ | 【93.85%】https://www.huya.com/24501877\n",
      "| ████████████████████████ | 【96.76%】https://www.huya.com/24454031\n",
      "| ████████████████████████ | 【96.98%】https://www.huya.com/24488296\n",
      "| ████████████████████████ | 【97.48%】https://www.huya.com/24382440\n",
      "| ████████████████████████ | 【97.66%】https://www.huya.com/19577373\n",
      "| ████████████████████████ | 【97.96%】https://www.huya.com/22374187\n",
      "| ████████████████████████ | 【98.09%】https://www.huya.com/24500849\n",
      "| ████████████████████████ | 【99.99%】"
     ]
    }
   ],
   "source": [
    "# Scrape every streamer's room page and collect the profile / room metadata\n",
    "# embedded in the page's inline script.\n",
    "BAR_WIDTH = 25             # number of blocks in a full progress bar\n",
    "room_records = []          # one merged dict per successfully parsed room\n",
    "profileRoomList = []       # room ids whose page could not be fetched or parsed\n",
    "total_rooms = len(pht.房间号)\n",
    "for n, profileRoom in enumerate(pht.房间号, start=1):\n",
    "    url = f'https://www.huya.com/{profileRoom}'\n",
    "    try:\n",
    "        r = requests.get(url, timeout=10)\n",
    "        selector = Selector(text=r.text)\n",
    "        datas = selector.xpath('/html/body/script[7]').getall()[0]\n",
    "        room_data = json.loads(re.findall('var TT_ROOM_DATA = (.*?})', datas)[0])         # room info\n",
    "        profile_info = json.loads(re.findall('var TT_PROFILE_INFO = (.*?})', datas)[0])   # streamer profile\n",
    "        # On duplicate keys the room_data value wins.\n",
    "        room_records.append({**profile_info, **room_data})\n",
    "    except (requests.RequestException, IndexError, ValueError):\n",
    "        # Request failed, the script tag / variable is missing, or the JSON is invalid.\n",
    "        # Catching only these keeps Ctrl-C (KeyboardInterrupt) working.\n",
    "        profileRoomList.append(profileRoom)\n",
    "        print(url)\n",
    "    print(\"\\r| \" + \"█\"*int(BAR_WIDTH*n/total_rooms) + f\" | 【{format(n/total_rooms, '.2%')}】\", end='', flush=True)\n",
    "\n",
    "# Build the frame once: growing a DataFrame row by row is quadratic, and\n",
    "# DataFrame.append was removed in pandas 2.0.\n",
    "pl = pd.DataFrame(room_records)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 356,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# pht.drop(index=pht[pht.房间号.isin(profileRoomList)].index, inplace=True)      # 去掉违规整改的主播"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# pl.to_csv('房间信息.csv', index=False)\n",
    "# pl = pl[['sex', 'bussType', 'cameraOpen', 'fans', 'isOn', 'gameFullName', 'isReplay', 'lp', 'recommendStatus', 'startTime', 'type']]\n",
    "# pl = pl.rename(columns={'sex': '性别', 'bussType' : '直播类型', 'cameraOpen': '是否开启摄像头', 'fans': '粉丝数' , 'isOn': '是否在播中', 'gameFullName': '游戏类型', 'isReplay': '是否重播', 'lp': '用户id', 'recommendStatus': '推荐标签', 'startTime':'上次直播时间', 'type': '分类'})\n",
    "# pl['上次直播时间'] = pl.上次直播时间.map(time.localtime).map(lambda x: time.strftime(\"%Y--%m--%d %H:%M:%S\", x)).astype('datetime64')\n",
    "# pht = pht.reset_index(); pht.drop(columns='index', inplace=True) # 重新索引"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 443,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# 合并为个人信息\n",
    "# pht = pd.merge(pht, pl, on='用户id', how='left')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "###### 爬取公告"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://www.huya.com/660000\n",
      "https://www.huya.com/868067\n",
      "https://www.huya.com/660001\n",
      "https://www.huya.com/24493416\n",
      "https://www.huya.com/24467478\n",
      "https://www.huya.com/24357894\n",
      "https://www.huya.com/24462769\n",
      "https://www.huya.com/24467123\n",
      "https://www.huya.com/660113\n",
      "https://www.huya.com/18525045\n",
      "https://www.huya.com/20807565\n",
      "https://www.huya.com/11948626\n",
      "https://www.huya.com/19064631\n",
      "https://www.huya.com/24477568\n",
      "https://www.huya.com/13990934\n",
      "https://www.huya.com/22402642\n",
      "https://www.huya.com/19520860\n",
      "https://www.huya.com/24471118\n",
      "https://www.huya.com/660137\n",
      "https://www.huya.com/24488707\n",
      "https://www.huya.com/21982907\n",
      "https://www.huya.com/24497476\n",
      "https://www.huya.com/23188911\n",
      "https://www.huya.com/24483038\n",
      "https://www.huya.com/24108539\n",
      "https://www.huya.com/24471704\n",
      "https://www.huya.com/24179291\n",
      "https://www.huya.com/20530064\n",
      "https://www.huya.com/24488494\n",
      "https://www.huya.com/24489024\n",
      "https://www.huya.com/24489030\n",
      "https://www.huya.com/24482904\n",
      "https://www.huya.com/24179228\n",
      "https://www.huya.com/24471766\n",
      "https://www.huya.com/24462600\n",
      "https://www.huya.com/24468401\n",
      "https://www.huya.com/24495164\n",
      "https://www.huya.com/23611526\n",
      "https://www.huya.com/641641\n",
      "https://www.huya.com/24471013\n",
      "https://www.huya.com/24495138\n",
      "https://www.huya.com/24472438\n",
      "https://www.huya.com/24468749\n",
      "https://www.huya.com/22445930\n",
      "https://www.huya.com/16136814\n",
      "https://www.huya.com/24472502\n",
      "https://www.huya.com/24305320\n",
      "https://www.huya.com/24491521\n",
      "https://www.huya.com/20254804\n",
      "https://www.huya.com/24495569\n",
      "https://www.huya.com/18524915\n",
      "https://www.huya.com/660139\n",
      "https://www.huya.com/660108\n",
      "https://www.huya.com/660138\n",
      "https://www.huya.com/24495166\n",
      "https://www.huya.com/24209787\n",
      "https://www.huya.com/890001\n",
      "https://www.huya.com/24412848\n",
      "https://www.huya.com/24428253\n",
      "https://www.huya.com/14558284\n",
      "https://www.huya.com/11787847\n",
      "https://www.huya.com/24480797\n",
      "https://www.huya.com/24493505\n",
      "https://www.huya.com/24464681\n",
      "https://www.huya.com/24494223\n",
      "https://www.huya.com/24494164\n",
      "https://www.huya.com/24457242\n",
      "https://www.huya.com/24487896\n",
      "https://www.huya.com/24484206\n",
      "https://www.huya.com/24494608\n",
      "https://www.huya.com/24488768\n",
      "https://www.huya.com/24488712\n",
      "https://www.huya.com/24483534\n",
      "https://www.huya.com/24501877\n",
      "https://www.huya.com/24488296\n",
      "https://www.huya.com/16338628\n",
      "https://www.huya.com/24382440\n",
      "https://www.huya.com/24500849\n"
     ]
    }
   ],
   "source": [
    "def get_announcement(profileRoom):\n",
    "    \"\"\"Return the announcement text shown on a room page, or None.\n",
    "\n",
    "    Downloads ``https://www.huya.com/{profileRoom}`` and reads the first text\n",
    "    node matched under the element with id ``J_profileNoticeText``. When the\n",
    "    request fails or the page has no such node, the URL is printed and None\n",
    "    is returned.\n",
    "    \"\"\"\n",
    "    url = f'https://www.huya.com/{profileRoom}'\n",
    "    try:\n",
    "        r = requests.get(url, timeout=10)\n",
    "    except requests.RequestException:\n",
    "        # A single unreachable room must not abort the whole .apply() run.\n",
    "        print(url)\n",
    "        return None\n",
    "    # .get() yields the first match or None, so no exception is needed for control flow.\n",
    "    notice = Selector(text=r.text).xpath('//*[@id=\"J_profileNoticeText\"]/span/text()').get()\n",
    "    if notice is None:\n",
    "        print(url)\n",
    "    return notice\n",
    "\n",
    "pht['公告'] = pht.房间号.apply(get_announcement)\n",
    "pht.to_csv('个人信息.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户id</th>\n",
       "      <th>房间号</th>\n",
       "      <th>公会id</th>\n",
       "      <th>昵称</th>\n",
       "      <th>标签</th>\n",
       "      <th>性别</th>\n",
       "      <th>直播类型</th>\n",
       "      <th>是否开启摄像头</th>\n",
       "      <th>粉丝数</th>\n",
       "      <th>是否在播中</th>\n",
       "      <th>游戏类型</th>\n",
       "      <th>是否重播</th>\n",
       "      <th>推荐标签</th>\n",
       "      <th>上次直播时间</th>\n",
       "      <th>分类</th>\n",
       "      <th>公告</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>62</td>\n",
       "      <td>1149399980</td>\n",
       "      <td>215609</td>\n",
       "      <td>1149399980</td>\n",
       "      <td>李阿特</td>\n",
       "      <td>魅力新星</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>662438.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>英雄联盟</td>\n",
       "      <td>0.0</td>\n",
       "      <td>496.0</td>\n",
       "      <td>1.617541e+09</td>\n",
       "      <td>NORMAL</td>\n",
       "      <td>直播时间   12:00-16:00  21:00-2:00 商务+Q：56155555 添...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          用户id     房间号        公会id   昵称    标签   性别  直播类型  是否开启摄像头       粉丝数  \\\n",
       "62  1149399980  215609  1149399980  李阿特  魅力新星  0.0   1.0      0.0  662438.0   \n",
       "\n",
       "    是否在播中  游戏类型  是否重播   推荐标签        上次直播时间      分类  \\\n",
       "62    0.0  英雄联盟   0.0  496.0  1.617541e+09  NORMAL   \n",
       "\n",
       "                                                   公告  \n",
       "62  直播时间   12:00-16:00  21:00-2:00 商务+Q：56155555 添...  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pht"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "###### 爬取视频动态"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [
     0
    ],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Scrape the moments (video posts) feed of every streamer. Example request:\n",
    "# https://liveapi.huya.com/moment/getMomentListByUidForWeb?callback=jQuery1113007995658170363851_1617714491431&pid=1149399980&uid=0&seed=0&_=1617714491434\n",
    "moment_records = []   # moment dicts of all streamers\n",
    "failed_uids = []      # uids whose feed could not be fetched or parsed\n",
    "\n",
    "for uid in tqdm(pht.用户id):\n",
    "    url = f'https://liveapi.huya.com/moment/getMomentListByUidForWeb?callback=jQuery1113007995658170363851_1617714491431&pid={uid}&uid=0&seed=0&_=1617714491434'\n",
    "    try:\n",
    "        r = requests.get(url, timeout=10)\n",
    "        # JSONP response: keep what lies between the first '(' and the last ')'.\n",
    "        body = r.text\n",
    "        payload = json.loads(body[body.index('(') + 1:body.rindex(')')])\n",
    "        # A missing/empty moments list contributes no rows.\n",
    "        moment_records.extend(payload['data']['moments'] or [])\n",
    "    except (requests.RequestException, ValueError, KeyError, TypeError):\n",
    "        # Record the failure instead of silently dropping it; Ctrl-C still propagates.\n",
    "        failed_uids.append(uid)\n",
    "\n",
    "# Build the frame once: growing a DataFrame row by row is quadratic, and\n",
    "# DataFrame.append was removed in pandas 2.0.\n",
    "vi = pd.DataFrame(moment_records)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户id</th>\n",
       "      <th>房间号</th>\n",
       "      <th>公会id</th>\n",
       "      <th>昵称</th>\n",
       "      <th>标签</th>\n",
       "      <th>性别</th>\n",
       "      <th>直播类型</th>\n",
       "      <th>是否开启摄像头</th>\n",
       "      <th>粉丝数</th>\n",
       "      <th>是否在播中</th>\n",
       "      <th>游戏类型</th>\n",
       "      <th>是否重播</th>\n",
       "      <th>推荐标签</th>\n",
       "      <th>上次直播时间</th>\n",
       "      <th>分类</th>\n",
       "      <th>公告</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>6531</td>\n",
       "      <td>17303541</td>\n",
       "      <td>351740</td>\n",
       "      <td>17303541</td>\n",
       "      <td>歌y</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3800.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>英雄联盟</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.617606e+09</td>\n",
       "      <td>NORMAL</td>\n",
       "      <td>新主播求订阅～ 下午直播到晚上 感谢关注～</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          用户id     房间号      公会id  昵称   标签   性别  直播类型  是否开启摄像头     粉丝数  是否在播中  \\\n",
       "6531  17303541  351740  17303541  歌y  NaN  2.0   1.0      0.0  3800.0    1.0   \n",
       "\n",
       "      游戏类型  是否重播  推荐标签        上次直播时间      分类                     公告  \n",
       "6531  英雄联盟   0.0   0.0  1.617606e+09  NORMAL  新主播求订阅～ 下午直播到晚上 感谢关注～  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vi = pd.read_csv('视频动态.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "###### 爬取周贡榜"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████| 11145/11145 [46:47<00:00,  3.97it/s]\n"
     ]
    }
   ],
   "source": [
    "# Scrape each streamer's weekly contribution ranking, e.g.\n",
    "# https://www.huya.com/cache5min.php?m=WeekRank&do=getItemsByPid&pid=1149399980\n",
    "week_rank_records = []\n",
    "\n",
    "for uid in tqdm(pht.用户id):\n",
    "    url = f'https://www.huya.com/cache5min.php?m=WeekRank&do=getItemsByPid&pid={uid}'\n",
    "    try:\n",
    "        r = requests.get(url, timeout=10)\n",
    "        week_contribute = r.json()['data']['vWeekRankItem']\n",
    "    except (requests.RequestException, ValueError, KeyError, TypeError) as err:\n",
    "        # Best effort: skip a streamer whose ranking cannot be fetched, but report it.\n",
    "        tqdm.write(f'skip uid={uid}: {err!r}')\n",
    "        continue\n",
    "    # Tag each contributor row with its streamer up front rather than slicing wc afterwards:\n",
    "    # .loc slices are label-inclusive, so a slice starting at len-1 would also relabel the\n",
    "    # previous streamer's last row.\n",
    "    for contributor in week_contribute or []:\n",
    "        week_rank_records.append({**contributor, '用户id': uid})\n",
    "\n",
    "# Build the frame once; DataFrame.append was removed in pandas 2.0 and copies on every call.\n",
    "wc = pd.DataFrame(week_rank_records)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>lUid</th>\n",
       "      <th>sNickName</th>\n",
       "      <th>iScore</th>\n",
       "      <th>iGuardLevel</th>\n",
       "      <th>iNobleLevel</th>\n",
       "      <th>sLogo</th>\n",
       "      <th>iUserLevel</th>\n",
       "      <th>tNobleLevel</th>\n",
       "      <th>iSFFlag</th>\n",
       "      <th>用户id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1786502568</td>\n",
       "      <td>村长洗头用飘柔</td>\n",
       "      <td>1941400</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>https://huyaimg.msstatic.com/avatar/1033/6b/7a...</td>\n",
       "      <td>29</td>\n",
       "      <td>{'iNobleLevel': 2, 'iAttrType': 0}</td>\n",
       "      <td>0</td>\n",
       "      <td>1.820796e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>1199515156356</td>\n",
       "      <td>赵茗茗同学</td>\n",
       "      <td>1798200</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>https://huyaimg.msstatic.com/avatar/1031/93/d0...</td>\n",
       "      <td>15</td>\n",
       "      <td>{'iNobleLevel': 5, 'iAttrType': 0}</td>\n",
       "      <td>0</td>\n",
       "      <td>1.820796e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>1698832695</td>\n",
       "      <td>雨滴【小木兰】</td>\n",
       "      <td>657400</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>https://huyaimg.msstatic.com/avatar/1030/30/f1...</td>\n",
       "      <td>16</td>\n",
       "      <td>{'iNobleLevel': 1, 'iAttrType': 0}</td>\n",
       "      <td>0</td>\n",
       "      <td>1.820796e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>1356924630</td>\n",
       "      <td>无痕</td>\n",
       "      <td>503200</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>https://huyaimg.msstatic.com/avatar/1096/d6/cf...</td>\n",
       "      <td>19</td>\n",
       "      <td>{'iNobleLevel': 4, 'iAttrType': 0}</td>\n",
       "      <td>0</td>\n",
       "      <td>1.820796e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>1199573401591</td>\n",
       "      <td>小蘑菇QAQ</td>\n",
       "      <td>498000</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>https://huyaimg.msstatic.com/avatar/1075/5c/91...</td>\n",
       "      <td>2</td>\n",
       "      <td>{'iNobleLevel': 1, 'iAttrType': 0}</td>\n",
       "      <td>0</td>\n",
       "      <td>1.820796e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>45013</td>\n",
       "      <td>1464086656</td>\n",
       "      <td>听说接待姐姐想睡我</td>\n",
       "      <td>5000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>http://huyaimg.msstatic.com/avatar/1060/f6/0b9...</td>\n",
       "      <td>10</td>\n",
       "      <td>{'iNobleLevel': 0, 'iAttrType': 0}</td>\n",
       "      <td>0</td>\n",
       "      <td>2.323517e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>45014</td>\n",
       "      <td>1632134593</td>\n",
       "      <td>似水流年，繁华了谁</td>\n",
       "      <td>3700</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>http://huyaimg.msstatic.com/avatar/1060/2a/572...</td>\n",
       "      <td>6</td>\n",
       "      <td>{'iNobleLevel': 0, 'iAttrType': 0}</td>\n",
       "      <td>0</td>\n",
       "      <td>2.323517e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>45015</td>\n",
       "      <td>1707464702</td>\n",
       "      <td>魔法少女蒙多</td>\n",
       "      <td>1000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>https://huyaimg.msstatic.com/avatar/1075/26/e4...</td>\n",
       "      <td>5</td>\n",
       "      <td>{'iNobleLevel': 0, 'iAttrType': 0}</td>\n",
       "      <td>0</td>\n",
       "      <td>2.214316e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>45016</td>\n",
       "      <td>880783781</td>\n",
       "      <td>玙丶苏苏酱的绯闻男友</td>\n",
       "      <td>33000</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>https://huyaimg.msstatic.com/avatar/1006/95/fa...</td>\n",
       "      <td>25</td>\n",
       "      <td>{'iNobleLevel': 1, 'iAttrType': 0}</td>\n",
       "      <td>0</td>\n",
       "      <td>2.214316e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>45017</td>\n",
       "      <td>1279518128905</td>\n",
       "      <td>兰宝【酱酱】</td>\n",
       "      <td>100</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>https://huyaimg.msstatic.com/avatar/1087/0c/25...</td>\n",
       "      <td>22</td>\n",
       "      <td>{'iNobleLevel': 2, 'iAttrType': 0}</td>\n",
       "      <td>0</td>\n",
       "      <td>2.214316e+09</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>45018 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                lUid   sNickName   iScore  iGuardLevel  iNobleLevel  \\\n",
       "0         1786502568     村长洗头用飘柔  1941400            0            2   \n",
       "1      1199515156356       赵茗茗同学  1798200            0            5   \n",
       "2         1698832695     雨滴【小木兰】   657400            0            1   \n",
       "3         1356924630          无痕   503200            0            4   \n",
       "4      1199573401591      小蘑菇QAQ   498000            0            1   \n",
       "...              ...         ...      ...          ...          ...   \n",
       "45013     1464086656   听说接待姐姐想睡我     5000            0            0   \n",
       "45014     1632134593   似水流年，繁华了谁     3700            0            0   \n",
       "45015     1707464702      魔法少女蒙多     1000            0            0   \n",
       "45016      880783781  玙丶苏苏酱的绯闻男友    33000            0            1   \n",
       "45017  1279518128905      兰宝【酱酱】      100            0            2   \n",
       "\n",
       "                                                   sLogo  iUserLevel  \\\n",
       "0      https://huyaimg.msstatic.com/avatar/1033/6b/7a...          29   \n",
       "1      https://huyaimg.msstatic.com/avatar/1031/93/d0...          15   \n",
       "2      https://huyaimg.msstatic.com/avatar/1030/30/f1...          16   \n",
       "3      https://huyaimg.msstatic.com/avatar/1096/d6/cf...          19   \n",
       "4      https://huyaimg.msstatic.com/avatar/1075/5c/91...           2   \n",
       "...                                                  ...         ...   \n",
       "45013  http://huyaimg.msstatic.com/avatar/1060/f6/0b9...          10   \n",
       "45014  http://huyaimg.msstatic.com/avatar/1060/2a/572...           6   \n",
       "45015  https://huyaimg.msstatic.com/avatar/1075/26/e4...           5   \n",
       "45016  https://huyaimg.msstatic.com/avatar/1006/95/fa...          25   \n",
       "45017  https://huyaimg.msstatic.com/avatar/1087/0c/25...          22   \n",
       "\n",
       "                              tNobleLevel  iSFFlag          用户id  \n",
       "0      {'iNobleLevel': 2, 'iAttrType': 0}        0  1.820796e+09  \n",
       "1      {'iNobleLevel': 5, 'iAttrType': 0}        0  1.820796e+09  \n",
       "2      {'iNobleLevel': 1, 'iAttrType': 0}        0  1.820796e+09  \n",
       "3      {'iNobleLevel': 4, 'iAttrType': 0}        0  1.820796e+09  \n",
       "4      {'iNobleLevel': 1, 'iAttrType': 0}        0  1.820796e+09  \n",
       "...                                   ...      ...           ...  \n",
       "45013  {'iNobleLevel': 0, 'iAttrType': 0}        0  2.323517e+09  \n",
       "45014  {'iNobleLevel': 0, 'iAttrType': 0}        0  2.323517e+09  \n",
       "45015  {'iNobleLevel': 0, 'iAttrType': 0}        0  2.214316e+09  \n",
       "45016  {'iNobleLevel': 1, 'iAttrType': 0}        0  2.214316e+09  \n",
       "45017  {'iNobleLevel': 2, 'iAttrType': 0}        0  2.214316e+09  \n",
       "\n",
       "[45018 rows x 10 columns]"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "###### 爬取主播所在地、个人等级、个性签名"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# pht = pd.read_csv(r'D:\\User\\Document\\Documents\\狗熊会项目\\狗熊会_美团商业分析\\虎牙直播分析\\datas\\个人信息.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "城市、省份、个性签名、个人等级"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "city_info = pd.DataFrame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "code_folding": [
     0
    ],
    "hidden": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Lookup table: province name -> macro-region name\n",
    "proDic = {\n",
    "    province: region\n",
    "    for region, provinces in [\n",
    "        ('东北', ('黑龙江', '吉林', '辽宁')),\n",
    "        ('华东', ('上海', '江苏', '浙江', '安徽', '福建', '江西', '山东', '台湾')),\n",
    "        ('华北', ('北京', '天津', '山西', '河北', '内蒙古')),\n",
    "        ('华中', ('河南', '湖北', '湖南')),\n",
    "        ('华南', ('广东', '广西', '海南', '香港', '澳门')),\n",
    "        ('西南', ('四川', '贵州', '云南', '重庆', '西藏')),\n",
    "        ('西北', ('陕西', '甘肃', '青海', '宁夏', '新疆')),\n",
    "    ]\n",
    "    for province in provinces\n",
    "}\n",
    "# Two catch-all entries: overseas, and a missing (NaN) province\n",
    "proDic.update({'海外': '海外', np.nan: '未知'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "107524968\n",
      "2450085830\n",
      "347550312\n",
      "303323372\n",
      "2388285699\n",
      "2374052354\n",
      "1279529734282\n",
      "1015645610\n",
      "1570708927\n",
      "1495460806\n",
      "1199573451995\n",
      "1030037565\n",
      "1834012273\n",
      "2277231098\n",
      "1199558241439\n",
      "1199560414273\n",
      "1199513178041\n",
      "1199524193232\n",
      "592241638\n",
      "94911822\n",
      "2252636076\n",
      "1259522948495\n",
      "83149793\n",
      "1576072743\n",
      "1022868157\n",
      "114262549\n",
      "986050574\n",
      "73453417\n",
      "188258209\n",
      "108894026\n",
      "543795422\n",
      "1704946509\n",
      "1858099093\n",
      "1279520994004\n",
      "1199530673939\n",
      "1459592890\n",
      "1279525372411\n",
      "2200520220\n",
      "1740058409\n",
      "1124745781\n",
      "495712680\n",
      "260290307\n",
      "135874020\n",
      "1352454575\n",
      "1812339109\n",
      "2316188108\n",
      "106391116\n",
      "2208644642\n",
      "1259556253588\n",
      "1431893932\n",
      "369376926\n",
      "362689823\n",
      "102338472\n",
      "2362748051\n",
      "1655502494\n",
      "1199526652788\n",
      "1199553548170\n",
      "1259517213255\n",
      "1715302829\n",
      "2298111093\n",
      "1144822605\n",
      "1145672980\n",
      "903458764\n",
      "1734704253\n",
      "1279518041821\n",
      "1584201529\n",
      "1487821463\n",
      "1199572365388\n",
      "1727757254\n",
      "1199534080132\n",
      "1259524841524\n",
      "1259524798885\n",
      "1641010897\n",
      "1451958260\n",
      "1199527080692\n",
      "59886479\n",
      "899516667\n",
      "975332933\n",
      "1724537603\n",
      "1512791977\n",
      "141378883\n",
      "1419256277\n",
      "1055317444\n",
      "1533535164\n",
      "1662939631\n",
      "321413676\n",
      "801618774\n",
      "139134273\n",
      "1199557715527\n",
      "1705461883\n",
      "1540525872\n",
      "637019265\n",
      "2377182813\n",
      "8184919\n",
      "689208\n",
      "2235092354\n",
      "1199568161828\n",
      "1354960957\n",
      "1751281769\n",
      "1769406443\n",
      "1635160935\n",
      "1860272136\n",
      "1706213099\n",
      "2373814509\n",
      "1279515849393\n",
      "1634862283\n",
      "1717494633\n",
      "2255840386\n",
      "1628542632\n",
      "907744708\n",
      "1506173386\n",
      "1661851304\n",
      "1461042572\n",
      "1167372370\n",
      "1186647107\n",
      "1259522164252\n",
      "2208016438\n",
      "1279518587923\n",
      "2382850939\n",
      "2224479118\n",
      "1345624381\n",
      "1259518544904\n",
      "996611540\n",
      "1279521363966\n",
      "1615666377\n",
      "1279525508455\n",
      "826078335\n",
      "171295889\n",
      "118445494\n",
      "1199536328984\n",
      "2245199045\n",
      "1279517611153\n",
      "160329907\n",
      "66498139\n",
      "1686917134\n",
      "90995304\n",
      "1611631661\n",
      "1620248941\n",
      "175426863\n",
      "1553712706\n",
      "1156308134\n",
      "1690752838\n",
      "1055134890\n",
      "1167396698\n",
      "1755478822\n",
      "2255547483\n",
      "1199532159659\n",
      "1273522180\n",
      "1199527960999\n",
      "1199514101541\n",
      "1082673\n",
      "159672117\n",
      "1279517375920\n",
      "1199574067849\n",
      "80140033\n",
      "1199531629302\n",
      "1279516514344\n",
      "2258627083\n",
      "1279519952165\n",
      "1492947084\n",
      "78839596\n",
      "156844042\n",
      "1701781473\n",
      "1620337097\n",
      "958161582\n",
      "2380258749\n",
      "1708881430\n",
      "1199570194973\n",
      "1302395666\n",
      "1279528989523\n",
      "1279521278058\n",
      "2183455199\n",
      "2191902859\n",
      "128266374\n",
      "2179461579\n",
      "1259527896681\n",
      "1827825776\n",
      "1685185991\n",
      "1259517121283\n",
      "1199564791671\n",
      "1430103179\n",
      "733205687\n",
      "192402813\n",
      "1259512472199\n",
      "881777466\n",
      "22361661\n",
      "620690284\n",
      "1687276060\n",
      "123724163\n",
      "1259533003088\n",
      "100354831\n",
      "1461333424\n",
      "1455350001\n",
      "1199091748\n",
      "823009976\n",
      "1199574239640\n",
      "1107689703\n",
      "94258703\n",
      "1571913911\n",
      "8685487\n",
      "1853721810\n",
      "1364374125\n",
      "1451506195\n",
      "21981923\n",
      "1734693394\n",
      "1044775100\n",
      "1158739669\n",
      "15862445\n",
      "564602642\n"
     ]
    }
   ],
   "source": [
    "# Fetch the extended profile of every streamer whose province has no entry in proDic.\n",
    "extend_records = []\n",
    "for uid in pht[pht.省份.map(proDic).isna()].用户id:\n",
    "    url = f'https://q.huya.com/message/index.php?m=Message&do=getUserExtendInfo&uid={uid}&callback=jsonp_021539682112189662'\n",
    "    try:\n",
    "        r = requests.get(url, timeout=10)\n",
    "        # The reply is JSONP, callback({...}); parse whatever sits between the outermost\n",
    "        # parentheses instead of relying on a hard-coded callback-name length.\n",
    "        body = r.text\n",
    "        dic = dict(json.loads(body[body.index('(') + 1:body.rindex(')')])['data'])\n",
    "    except (requests.RequestException, ValueError, KeyError, TypeError) as err:\n",
    "        # Best effort: skip a streamer whose profile cannot be fetched, but report it.\n",
    "        print(f'skip uid={uid}: {err!r}')\n",
    "        continue\n",
    "    dic['uid'] = uid\n",
    "    extend_records.append(dic)\n",
    "\n",
    "# Add all new rows in one step; DataFrame.append was removed in pandas 2.0 and copies on every call.\n",
    "# NOTE(review): a previously saved output shows one uid as -2147483648, which looks like an\n",
    "# integer overflow from the row-by-row append - confirm the uid column is intact after this change.\n",
    "new_rows = pd.DataFrame(extend_records)\n",
    "city_info = new_rows if city_info.empty else pd.concat([city_info, new_rows], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Keep only the first row scraped for each uid\n",
    "city_info = city_info.drop_duplicates(subset='uid', keep='first')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {
    "hidden": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Write the re-fetched city / province back into pht, matching rows on user id.\n",
    "# A keyed lookup replaces the per-uid scan of both frames, and there is no bare except,\n",
    "# so a real problem (e.g. a missing column in city_info) surfaces instead of being ignored.\n",
    "refetched = city_info.drop_duplicates('uid').set_index('uid')\n",
    "hit = pht['用户id'].isin(refetched.index)  # only rows that actually have re-fetched data\n",
    "pht.loc[hit, '城市'] = pht.loc[hit, '用户id'].map(refetched['city'])\n",
    "pht.loc[hit, '省份'] = pht.loc[hit, '用户id'].map(refetched['province'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>province</th>\n",
       "      <th>signature</th>\n",
       "      <th>uid</th>\n",
       "      <th>userLevel</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>澳门</td>\n",
       "      <td>特别行政区</td>\n",
       "      <td>新浪微博:上单主播黑店百地 20:00~2:00</td>\n",
       "      <td>107524968</td>\n",
       "      <td>26.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>香港</td>\n",
       "      <td>香港</td>\n",
       "      <td>当代陶渊明</td>\n",
       "      <td>-2147483648</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>台湾</td>\n",
       "      <td>台湾</td>\n",
       "      <td>谢谢有你们</td>\n",
       "      <td>347550312</td>\n",
       "      <td>14.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>香港</td>\n",
       "      <td>香港</td>\n",
       "      <td>“人来人往 勿失勿忘”</td>\n",
       "      <td>303323372</td>\n",
       "      <td>33.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>香港</td>\n",
       "      <td>香港</td>\n",
       "      <td>没钱 别圈</td>\n",
       "      <td>1015645610</td>\n",
       "      <td>12.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>387</td>\n",
       "      <td>台湾</td>\n",
       "      <td>台湾</td>\n",
       "      <td>你没听过他的故事，怎么了解他的为人🌴</td>\n",
       "      <td>1259517121283</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>388</td>\n",
       "      <td>银川</td>\n",
       "      <td>宁夏</td>\n",
       "      <td>接-陪玩-定位-上段-直接加我</td>\n",
       "      <td>1199564791671</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>392</td>\n",
       "      <td>海口</td>\n",
       "      <td>海南</td>\n",
       "      <td></td>\n",
       "      <td>1259512472199</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>398</td>\n",
       "      <td>黄南</td>\n",
       "      <td>青海</td>\n",
       "      <td></td>\n",
       "      <td>1259533003088</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>404</td>\n",
       "      <td>海东</td>\n",
       "      <td>青海</td>\n",
       "      <td>深渊打手</td>\n",
       "      <td>1199574239640</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>210 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    city province                 signature            uid  userLevel\n",
       "0     澳门    特别行政区  新浪微博:上单主播黑店百地 20:00~2:00      107524968       26.0\n",
       "1     香港       香港                     当代陶渊明    -2147483648        5.0\n",
       "2     台湾       台湾                     谢谢有你们      347550312       14.0\n",
       "3     香港       香港               “人来人往 勿失勿忘”      303323372       33.0\n",
       "7     香港       香港                     没钱 别圈     1015645610       12.0\n",
       "..   ...      ...                       ...            ...        ...\n",
       "387   台湾       台湾        你没听过他的故事，怎么了解他的为人🌴  1259517121283        5.0\n",
       "388   银川       宁夏           接-陪玩-定位-上段-直接加我  1199564791671        3.0\n",
       "392   海口       海南                            1259512472199        4.0\n",
       "398   黄南       青海                            1259533003088        2.0\n",
       "404   海东       青海                      深渊打手  1199574239640        2.0\n",
       "\n",
       "[210 rows x 5 columns]"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "city_info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "city_info.to_csv('省份_签名_等级_部分数据.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "pht = pd.read_csv(r'D:\\User\\Document\\Documents\\狗熊会项目\\狗熊会_美团商业分析\\虎牙直播分析\\datas\\个人信息.csv')\n",
    "# Attach the scraped profile fields by user id. DataFrame.join aligns on the row index, and\n",
    "# city_info's index is only its scrape order, so an index join pairs rows with the wrong streamer.\n",
    "# NOTE(review): if the CSV already contains city/province/signature/userLevel, merge adds _x/_y\n",
    "# suffixes instead of raising - confirm which copy of those columns should win.\n",
    "pht = pht.merge(city_info.drop_duplicates('uid'), how='left', left_on='用户id', right_on='uid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "hidden": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "pht.to_csv(r'D:\\User\\Document\\Documents\\狗熊会项目\\狗熊会_美团商业分析\\虎牙直播分析\\datas\\个人信息.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "###### 爬取主播等级、周排名、本周成长值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# rank = pd.DataFrame()\n",
    "# for uid in tqdm(pht.用户id):\n",
    "#     url = f'https://q.huya.com/yy/?m=ProfileLevel&do=getPresenterGrowRank&callback=jQuery1830514955527579928_1618016899702&pid={uid}'\n",
    "#     r = requests.get(url)\n",
    "#     rank = rank.append(json.loads(r.text[40:-1])['data']['tGrowItem'], ignore_index=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "已爬取完毕，直接合并即可"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>iLevel</th>\n",
       "      <th>iLightUp</th>\n",
       "      <th>iLiveState</th>\n",
       "      <th>iRoomId</th>\n",
       "      <th>lPid</th>\n",
       "      <th>sAction</th>\n",
       "      <th>sFaceURL</th>\n",
       "      <th>sNickname</th>\n",
       "      <th>tGrowInfo</th>\n",
       "      <th>lWeeklyExp</th>\n",
       "      <th>lWeeklyIncExp</th>\n",
       "      <th>iRank</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>39.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1820796294</td>\n",
       "      <td>NaN</td>\n",
       "      <td>https://huyaimg.msstatic.com/avatar/1091/31/56...</td>\n",
       "      <td>时代-十四剑姬</td>\n",
       "      <td>{'lWeeklyExp': '360000', 'lWeeklyIncExp': '305...</td>\n",
       "      <td>360000</td>\n",
       "      <td>305161</td>\n",
       "      <td>848</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2367547387</td>\n",
       "      <td>NaN</td>\n",
       "      <td>https://huyaimg.msstatic.com/avatar/1076/84/3e...</td>\n",
       "      <td>Zz1tai姿态</td>\n",
       "      <td>{'lWeeklyExp': '155000', 'lWeeklyIncExp': '435...</td>\n",
       "      <td>155000</td>\n",
       "      <td>43563</td>\n",
       "      <td>5283</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   iLevel  iLightUp  iLiveState  iRoomId        lPid sAction  \\\n",
       "0    39.0       1.0         0.0      0.0  1820796294     NaN   \n",
       "1    35.0       0.0         0.0      0.0  2367547387     NaN   \n",
       "\n",
       "                                            sFaceURL sNickname  \\\n",
       "0  https://huyaimg.msstatic.com/avatar/1091/31/56...   时代-十四剑姬   \n",
       "1  https://huyaimg.msstatic.com/avatar/1076/84/3e...  Zz1tai姿态   \n",
       "\n",
       "                                           tGrowInfo  lWeeklyExp  \\\n",
       "0  {'lWeeklyExp': '360000', 'lWeeklyIncExp': '305...      360000   \n",
       "1  {'lWeeklyExp': '155000', 'lWeeklyIncExp': '435...      155000   \n",
       "\n",
       "   lWeeklyIncExp  iRank  \n",
       "0         305161    848  \n",
       "1          43563   5283  "
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rank = pd.read_csv('周排名_等级_成长值.csv')\n",
    "rank.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "rank = rank.rename(columns={'lPid':'用户id','iLevel':'主播等级', 'iLightUp':'是否点亮徽章', 'lWeeklyIncExp':'本周成长值','iRank':'本周排名'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户id</th>\n",
       "      <th>主播等级</th>\n",
       "      <th>是否点亮徽章</th>\n",
       "      <th>本周成长值</th>\n",
       "      <th>本周排名</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1820796294</td>\n",
       "      <td>39.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>305161</td>\n",
       "      <td>848</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2367547387</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>43563</td>\n",
       "      <td>5283</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         用户id  主播等级  是否点亮徽章   本周成长值  本周排名\n",
       "0  1820796294  39.0     1.0  305161   848\n",
       "1  2367547387  35.0     0.0   43563  5283"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a = rank[['用户id','主播等级','是否点亮徽章','本周成长值','本周排名']]\n",
    "a.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# pht = pd.read_csv(r'D:\\User\\Document\\Documents\\狗熊会项目\\狗熊会_美团商业分析\\虎牙直播分析\\datas\\个人信息.csv')\n",
    "pht = pd.merge(pht, a, on='用户id', how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户id</th>\n",
       "      <th>昵称</th>\n",
       "      <th>标签</th>\n",
       "      <th>公告</th>\n",
       "      <th>公会级别</th>\n",
       "      <th>公会规模</th>\n",
       "      <th>上次直播时间</th>\n",
       "      <th>房间号</th>\n",
       "      <th>性别</th>\n",
       "      <th>直播类型</th>\n",
       "      <th>...</th>\n",
       "      <th>最高人气</th>\n",
       "      <th>平均小时人气</th>\n",
       "      <th>city</th>\n",
       "      <th>province</th>\n",
       "      <th>signature</th>\n",
       "      <th>userLevel</th>\n",
       "      <th>主播等级</th>\n",
       "      <th>是否点亮徽章</th>\n",
       "      <th>本周成长值</th>\n",
       "      <th>本周排名</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1820796294</td>\n",
       "      <td>时代-十四剑姬</td>\n",
       "      <td>大神推荐</td>\n",
       "      <td>直播时间晚上9点播到凌晨4点</td>\n",
       "      <td>钻石公会</td>\n",
       "      <td>中</td>\n",
       "      <td>2021-04-05 21:03:49</td>\n",
       "      <td>12214</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>3180593</td>\n",
       "      <td>2560319</td>\n",
       "      <td>成都</td>\n",
       "      <td>四川</td>\n",
       "      <td></td>\n",
       "      <td>6.0</td>\n",
       "      <td>39.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>305161</td>\n",
       "      <td>848</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2367547387</td>\n",
       "      <td>Zz1tai姿态</td>\n",
       "      <td>大神推荐</td>\n",
       "      <td>003带你小游峡谷~每天25点准时开播~再帮小弟卡个牌子爱死你们了！</td>\n",
       "      <td>白金公会</td>\n",
       "      <td>小</td>\n",
       "      <td>2021-04-04 22:32:57</td>\n",
       "      <td>333003</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>3874036</td>\n",
       "      <td>2948839</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>9.0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>43563</td>\n",
       "      <td>5283</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2 rows × 60 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         用户id        昵称    标签                                  公告  公会级别 公会规模  \\\n",
       "0  1820796294   时代-十四剑姬  大神推荐                      直播时间晚上9点播到凌晨4点  钻石公会    中   \n",
       "1  2367547387  Zz1tai姿态  大神推荐  003带你小游峡谷~每天25点准时开播~再帮小弟卡个牌子爱死你们了！  白金公会    小   \n",
       "\n",
       "                上次直播时间     房间号   性别  直播类型  ...     最高人气   平均小时人气  city  \\\n",
       "0  2021-04-05 21:03:49   12214  1.0   1.0  ...  3180593  2560319    成都   \n",
       "1  2021-04-04 22:32:57  333003  2.0   1.0  ...  3874036  2948839         \n",
       "\n",
       "   province  signature  userLevel  主播等级  是否点亮徽章   本周成长值  本周排名  \n",
       "0        四川                   6.0  39.0     1.0  305161   848  \n",
       "1                             9.0  35.0     0.0   43563  5283  \n",
       "\n",
       "[2 rows x 60 columns]"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pht.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "pht = pht.rename(columns={'city':'城市','province':'省份','signature':'签名','userLevel':'个人等级'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "pht.to_csv(r'D:\\User\\Document\\Documents\\狗熊会项目\\狗熊会_美团商业分析\\虎牙直播分析\\datas\\个人信息.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### 爬取热门英雄、英雄位置"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "###### 热门英雄"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 212,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# 网址、表单、请求头  \n",
    "url = 'https://www.op.gg/statistics/ajax2/champion/'\n",
    "json = {'type':'picked',\n",
    "'league':'',\n",
    "'period':'month',\n",
    "'mapId':1,\n",
    "'queue':'ranked'}\n",
    "\n",
    "headers = {\n",
    "    'Host': 'www.op.gg',\n",
    "'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0',\n",
    "'Accept': '*/*',\n",
    "'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',\n",
    "'Accept-Encoding': 'gzip, deflate, br',\n",
    "'Content-Type': 'application/json; charset=UTF-8',\n",
    "'X-Requested-With': 'XMLHttpRequest',\n",
    "'Content-Length': '53',\n",
    "'Origin': 'https://www.op.gg',\n",
    "'Connection': 'keep-alive',\n",
    "'Referer': 'https://www.op.gg/statistics/champion/',\n",
    "'Cookie': '_ga=GA1.1.632153092.1601615811; _ga_HKZFKE5JEL=GS1.1.1618149497.5.1.1618149703.0; wcs_bt=55c48ac9e22bec:1618149705; Hm_lvt_29884b6641f1b5709cc89a8ce5a99366=1618149502; _hist=%EA%B9%80%EC%B9%98%EB%A7%B9%24%EC%A1%B0%ED%86%A0%EB%AF%BC%24%EC%B1%84%ED%8C%85%EC%95%88%EC%B9%98%EB%8A%94%EC%86%8C%EB%82%98; __gads=ID=023d96cfc44a6faf-2281a6dcd0c300b2:T=1601616021:S=ALNI_MbomlUhbLVKMdk7Seg2SxRHWfELCQ; cto_bundle=wDULZ19KaENsTWFhUllJQ3Z5dThkRUJkZVA2TnNHY2JpZjdTTHFqdXdlZURUcXpYTmVTM3B0WThzckxEUzQ5WDJFanpxMFdBcGNHOUp5aFNLRzhieHh6c0hNUEpHbnpxeWxzQSUyQjJWbHd0dTQ3Zzg4TEglMkJoOW9PaGxIRk1WblA4JTJCNEVzMVlsVGN2T2FSUkd0TU5Ua3ZCY0N5WVElM0QlM0Q; _dd_s=rum=0&expire=1618150604554; Hm_lpvt_29884b6641f1b5709cc89a8ce5a99366=1618149705; _gid=GA1.2.799906881.1618149504; _gat_gtag_UA_37377845_1=1; _gat_gtag_UA_140073778_1=1',\n",
    "'Pragma': 'no-cache',\n",
    "'Cache-Control': 'no-cache',\n",
    "'TE': 'Trailers',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 215,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "r = requests.post(url, json=json, headers=headers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 236,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['虚空之女',\n",
       " '探险家',\n",
       " '盲僧',\n",
       " '暴走萝莉',\n",
       " '暗夜猎手',\n",
       " '魂锁典狱长',\n",
       " '解脱者',\n",
       " '涤魂圣枪',\n",
       " '沙漠玫瑰',\n",
       " '麦林炮手',\n",
       " '战争之影',\n",
       " '破败之王',\n",
       " '疾风剑豪',\n",
       " '圣枪游侠',\n",
       " '熔岩巨兽',\n",
       " '封魔剑魂',\n",
       " '刀锋舞者',\n",
       " '皮城女警',\n",
       " '光辉女郎',\n",
       " '离群之刺',\n",
       " '曙光女神',\n",
       " '影流之主',\n",
       " '腕豪',\n",
       " '蒸汽机器人',\n",
       " '堕落天使',\n",
       " '不屈之枪',\n",
       " '魔法猫咪',\n",
       " '荒漠屠夫',\n",
       " '法外狂徒',\n",
       " '恶魔小丑',\n",
       " '狂野女猎手',\n",
       " '牛头酋长',\n",
       " '仙灵女巫',\n",
       " '战争女神',\n",
       " '远古巫灵',\n",
       " '蜘蛛女皇',\n",
       " '正义巨像',\n",
       " '时间刺客',\n",
       " '不灭狂雷',\n",
       " '深海泰坦',\n",
       " '寒冰射手',\n",
       " '无极剑圣',\n",
       " '德玛西亚之力',\n",
       " '刀锋之影',\n",
       " '德邦总管',\n",
       " '永恒梦魇',\n",
       " '不祥之刃',\n",
       " '暗裔剑魔',\n",
       " '迷失之牙',\n",
       " '戏命师',\n",
       " '血港鬼影',\n",
       " '诺克萨斯之手',\n",
       " '无双剑姬',\n",
       " '卡牌大师',\n",
       " '星籁歌姬',\n",
       " '影流之镰',\n",
       " '兽灵行者',\n",
       " '赏金猎人',\n",
       " '祖安狂人',\n",
       " '扭曲树精',\n",
       " '圣锤之毅',\n",
       " '九尾妖狐',\n",
       " '武器大师',\n",
       " '山隐之焰',\n",
       " '诡术妖姬',\n",
       " '未来守护者',\n",
       " '酒桶',\n",
       " '迅捷斥候',\n",
       " '含羞蓓蕾',\n",
       " '沙漠死神',\n",
       " '祖安怒兽',\n",
       " '暮光之眼',\n",
       " '放逐之刃',\n",
       " '诺克萨斯统领',\n",
       " '潮汐海灵',\n",
       " '雪原双子',\n",
       " '永猎双子',\n",
       " '复仇之矛',\n",
       " '青钢影',\n",
       " '铁铠冥魂',\n",
       " '机械先驱',\n",
       " '披甲龙龟',\n",
       " '符文法师',\n",
       " '虚空掠夺者',\n",
       " '天启者',\n",
       " '元素女皇',\n",
       " '惩戒之箭',\n",
       " '生化魔人',\n",
       " '冰晶凤凰',\n",
       " '发条魔灵',\n",
       " '亡灵战神',\n",
       " '残月之肃',\n",
       " '复仇焰魂',\n",
       " '暮光星灵',\n",
       " '痛苦之拥',\n",
       " '猩红收割者',\n",
       " '镕铁少女',\n",
       " '死亡颂唱者',\n",
       " '逆羽',\n",
       " '万花通灵',\n",
       " '众星之子',\n",
       " '远古恐惧',\n",
       " '齐天大圣',\n",
       " '牧魂人',\n",
       " '机械公敌',\n",
       " '德玛西亚皇子',\n",
       " '时光守护者',\n",
       " '皎月女神',\n",
       " '虚空恐惧',\n",
       " '邪恶小法师',\n",
       " '海洋之灾',\n",
       " '傲之追猎者',\n",
       " '虚空行者',\n",
       " '德玛西亚之翼',\n",
       " '海兽祭司',\n",
       " '狂战士',\n",
       " '皮城执法官',\n",
       " '暗黑元首',\n",
       " '幻翎',\n",
       " '荆棘之兴',\n",
       " '虚空之眼',\n",
       " '虚空遁地兽',\n",
       " '瘟疫之源',\n",
       " '北地之怒',\n",
       " '殇之木乃伊',\n",
       " '蛮族之王',\n",
       " '大发明家',\n",
       " '岩雀',\n",
       " '狂暴之心',\n",
       " '虚空先知',\n",
       " '无畏战车',\n",
       " '魔蛇之拥',\n",
       " '黑暗之女',\n",
       " '龙血武姬',\n",
       " '风暴之怒',\n",
       " '唤潮鲛姬',\n",
       " '暴怒骑士',\n",
       " '星界游神',\n",
       " '荣耀行刑官',\n",
       " '巨魔之王',\n",
       " '深渊巨口',\n",
       " '正义天使',\n",
       " '炼金术士',\n",
       " '弗雷尔卓德之心',\n",
       " '英勇投弹手',\n",
       " '河流之王',\n",
       " '琴瑟仙女',\n",
       " '沙漠皇帝',\n",
       " '冰霜女巫',\n",
       " '爆破鬼才',\n",
       " '水晶先锋',\n",
       " '翠神',\n",
       " '瓦洛兰之盾',\n",
       " '铸星龙王']"
      ]
     },
     "execution_count": 236,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selector = Selector(r.text)\n",
    "\n",
    "leauge = selector.xpath('//table//tr/td[3]/a/text()').getall()\n",
    "leauge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 340,
   "metadata": {
    "hidden": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# 154个英雄 前30个为热门英雄。后30位冷门\n",
    "legDic = {}\n",
    "for n, key in enumerate(leauge):\n",
    "    if n <= 50:\n",
    "        legDic[key] = 1\n",
    "    else:\n",
    "        legDic[key] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 341,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "with open(r'D:\\User\\Document\\Documents\\狗熊会项目\\狗熊会_美团商业分析\\虎牙直播分析\\datas\\热门英雄字典.txt', 'w') as file:\n",
    "    file.write(str(legDic))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### 英雄位置"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 261,
   "metadata": {},
   "outputs": [],
   "source": [
    "url = 'https://www.op.gg/champion/statistics'\n",
    "headers = {'accept-language': 'zh-CN,zh;q=0.9'}\n",
    "r = requests.get(url, headers=headers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 300,
   "metadata": {},
   "outputs": [],
   "source": [
    "selector = Selector(r.text)\n",
    "# /html/body/div[2]/div[2]/div[2]/div[1]/div[5]/div[3]/div[1]/a/div[1]\n",
    "legRank = selector.xpath('//a/div/text()').getall()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 333,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 以第一个位置为准\n",
    "legRankDic = {}\n",
    "for key,value in zip(legRank[153:-1],legRank[154:]):\n",
    "    if len(key)<10:\n",
    "        legRankDic[key] = value.split()[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 335,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 保存数据\n",
    "with open('英雄位置.txt', 'w') as f:\n",
    "    f.write(str(legRankDic))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "#### 每隔半小时爬取一次，爬取24小时的英雄联盟专区在播直播间信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# ht44 = pd.read_csv('4月4日数据.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "code_folding": [
     0
    ],
    "hidden": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def get_live_pages(df, conti=True):\n",
    "    '''\n",
    "    爬取当前英雄联盟专区直播页面的在播主播人气等信息\n",
    "    df: 一个用于存放信息的dataframe（可以是空的dataframe）\n",
    "    conti：是否每半小时爬取一次，False：只爬取当前信息这一次\n",
    "    '''\n",
    "    try:\n",
    "        while True:\n",
    "            total_page = 5; page = 1\n",
    "            while page <= total_page:\n",
    "                url = f'https://www.huya.com/cache.php?m=LiveList&do=getLiveListByPage&gameId=1&tagAll=0&callback=getLiveListJsonpCallback&page={page}'\n",
    "                r = requests.get(url)\n",
    "                b = json.loads(r.text[25:-1])\n",
    "                get_time = b['data']['time']\n",
    "                a = b['data']['datas']\n",
    "                try:\n",
    "                    df = df.append(a, ignore_index=True)\n",
    "                    df['爬取时间'] = get_time\n",
    "                    print('\\r'+f'成功爬取第{page}页，共{total_page}页', end='')\n",
    "                except:\n",
    "                    print(f'爬取第{page}页失败')\n",
    "                if total_page < b['data']['totalPage']:\n",
    "                    total_page = b['data']['totalPage']\n",
    "                else:\n",
    "                    pass\n",
    "                page += 1\n",
    "            now = time.localtime(time.time())\n",
    "            print('爬取时间：',[i for i in now[:-4]])\n",
    "\n",
    "            # 爬取个人信息\n",
    "            if conti==True:\n",
    "                for i in range(120):     # 半小时后爬取\n",
    "                    time.sleep(15) \n",
    "            else:\n",
    "                return df\n",
    "                break\n",
    "    except:\n",
    "        return df\n",
    "\n",
    "# if __name__ == '__init__':\n",
    "#     ht424 = pd.DataFrame()\n",
    "#     ht424 = get_live_pages(ht424, conti=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\software\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "a['totalCount'] = a.totalCount.astype(int)    # totalCount 转为数值格式 \n",
    "hostSample15 = a[a.totalCount>300000][a.recommendTagName!='赛事精选'].sort_values('totalCount', ascending=False).head(15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://www.huya.com/100\n",
      "https://www.huya.com/518518\n",
      "https://www.huya.com/230023\n",
      "https://www.huya.com/941103\n",
      "https://www.huya.com/641641\n",
      "https://www.huya.com/660000\n",
      "https://www.huya.com/912360\n",
      "https://www.huya.com/575757\n",
      "https://www.huya.com/526520\n",
      "https://www.huya.com/972652\n",
      "https://www.huya.com/981585\n",
      "https://www.huya.com/891464\n",
      "https://www.huya.com/24591072\n",
      "https://www.huya.com/417019\n",
      "https://www.huya.com/100953\n"
     ]
    }
   ],
   "source": [
    "for rid in hostSample15.profileRoom:\n",
    "    print(f'https://www.huya.com/{rid}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "ht422 = pd.read_csv('4月22日数据.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.7",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
